In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import json
from operator import itemgetter
from collections import Counter
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from statsmodels.graphics.mosaicplot import mosaic
import seaborn as sns
from colour import Color
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from lime.lime_tabular import LimeTabularExplainer
from sklearn.metrics import classification_report, confusion_matrix

Loading all the data from the json file.

In [10]:
def iter_dataset(path=None):
    """Stream (prescription_counts, provider_variables) pairs from a JSON-lines file.

    Each line of the file is one JSON object with the keys
    'cms_prescription_counts' and 'provider_variables'.

    Parameters
    ----------
    path : str, optional
        Path to the .jsonl file.  Defaults to the module-level ``filename``
        (set in a later cell), so existing no-argument calls keep working.

    Yields
    ------
    tuple of (dict, dict)
        (cms_prescription_counts, provider_variables) for one practitioner.
    """
    if path is None:
        # Backward compatible: fall back to the global used by the original code.
        path = filename
    with open(path, 'rt') as fid:
        for line in fid:
            example = json.loads(line)
            yield (example['cms_prescription_counts'], example['provider_variables'])
            

The data is in JSON-lines format and is loaded as a list of tuples. Each tuple consists of two dictionaries: the first dictionary provides the prescriptions and their counts, and the second dictionary contains information about the prescription provider.

I am analyzing only those practitioners who have prescribed more than 50 distinct drugs.

In [11]:
filename = 'roam_prescription_based_prediction.jsonl'

# Keep only practitioners who prescribed more than 50 distinct drugs.
data = []
for phi_dict, y_dict in iter_dataset():
    if len(phi_dict) > 50:
        data.append((phi_dict, y_dict))

print ('Example data :\n')
data[0]
Example data :

Out[11]:
({'ABILIFY': 11,
  'ALLOPURINOL': 86,
  'ALPRAZOLAM': 45,
  'AMLODIPINE BESYLATE': 175,
  'AMLODIPINE BESYLATE-BENAZEPRIL': 12,
  'ATENOLOL': 62,
  'ATENOLOL-CHLORTHALIDONE': 53,
  'ATORVASTATIN CALCIUM': 19,
  'AZITHROMYCIN': 18,
  'BENAZEPRIL HCL': 11,
  'BUMETANIDE': 53,
  'BYSTOLIC': 14,
  'CALCITRIOL': 79,
  'CALCIUM ACETATE': 85,
  'CARISOPRODOL': 40,
  'CARVEDILOL': 36,
  'CIPROFLOXACIN HCL': 19,
  'CLONIDINE HCL': 43,
  'CLOPIDOGREL': 22,
  'DIAZEPAM': 24,
  'DIOVAN': 20,
  'DOXAZOSIN MESYLATE': 26,
  'FENOFIBRATE': 14,
  'FUROSEMIDE': 162,
  'GABAPENTIN': 35,
  'GLYBURIDE': 16,
  'HYDRALAZINE HCL': 50,
  'HYDROCHLOROTHIAZIDE': 59,
  'HYDROCODONE-ACETAMINOPHEN': 64,
  'IRBESARTAN': 11,
  'ISOSORBIDE MONONITRATE ER': 13,
  'KLOR-CON M10': 20,
  'LABETALOL HCL': 28,
  'LANTUS': 20,
  'LEVOTHYROXINE SODIUM': 12,
  'LISINOPRIL': 44,
  'LISINOPRIL-HYDROCHLOROTHIAZIDE': 19,
  'LOSARTAN POTASSIUM': 41,
  'LOSARTAN-HYDROCHLOROTHIAZIDE': 14,
  'LOVASTATIN': 11,
  'MEGESTROL ACETATE': 11,
  'MELOXICAM': 29,
  'METOLAZONE': 73,
  'METOPROLOL SUCCINATE': 104,
  'METOPROLOL TARTRATE': 93,
  'MIDODRINE HCL': 12,
  'MINOXIDIL': 14,
  'NEXIUM': 44,
  'NIFEDICAL XL': 32,
  'NIFEDIPINE ER': 51,
  'NOVOLOG': 12,
  'OMEPRAZOLE': 35,
  'OXYBUTYNIN CHLORIDE': 11,
  'OXYCODONE HCL': 51,
  'PANTOPRAZOLE SODIUM': 13,
  'POTASSIUM CHLORIDE': 30,
  'PRAVASTATIN SODIUM': 13,
  'PREDNISONE': 40,
  'RANITIDINE HCL': 21,
  'RENVELA': 117,
  'SENSIPAR': 94,
  'SERTRALINE HCL': 14,
  'SIMVASTATIN': 14,
  'SPIRONOLACTONE': 50,
  'TAMSULOSIN HCL': 14,
  'TEMAZEPAM': 41,
  'TRAMADOL HCL': 11,
  'ZOLPIDEM TARTRATE': 41},
 {'brand_name_rx_count': 384,
  'gender': 'M',
  'generic_rx_count': 2287,
  'region': 'South',
  'settlement_type': 'non-urban',
  'specialty': 'Nephrology',
  'years_practicing': 7})

Logistic Regression

In this section we will perform logistic regression. We will use the drugs that are prescribed by the practitioners to predict targets. In our case we will use 'region', 'gender' and 'specialty' as the targets, and see how well logistic regression does at predicting them. I am using grid search to choose the best parameters for classification.

In [44]:
def cross_validate(X, y, mod, param_grid=None, n_fold=5):
    """Repeatedly grid-search, fit, and score `mod` on random train/test splits.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix.
    y : array-like
        Target labels.
    mod : estimator
        scikit-learn estimator to tune and evaluate.
    param_grid : dict, optional
        Grid of hyper-parameters for GridSearchCV.  Defaults to an empty
        grid (no tuning).  Using None instead of a mutable `{}` default
        avoids the shared-mutable-default pitfall.
    n_fold : int
        Number of outer evaluation rounds (and inner CV folds).

    Returns
    -------
    tuple
        (scores, models, c_report, con_mat, X_test, y_test) — note that
        the classification report, confusion matrix, and test split come
        from the LAST round only; `scores` and `models` cover all rounds.
    """
    if param_grid is None:
        param_grid = {}
    scores = np.zeros(n_fold)
    models = []
    for i in range(n_fold):
        # NOTE(review): no random_state, so each run produces different
        # splits — set one if reproducibility is needed.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        cross_val = GridSearchCV(mod, param_grid=param_grid, cv=n_fold, n_jobs=-1)
        cross_val.fit(X_train, y_train)
        # GridSearchCV (refit=True by default) already refits the best
        # estimator on all of X_train, so no extra .fit() call is needed.
        best_mod = cross_val.best_estimator_

        predictions = best_mod.predict(X_test)
        c_report = classification_report(y_true=y_test, y_pred=predictions)
        con_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
        score = best_mod.score(X_test, y_test)
        scores[i] = score
        models.append(best_mod)
        # Summarize only the parameters that were actually tuned.
        best_mod_summary = {k: best_mod.get_params()[k] for k in param_grid}
        print("Fold {0:} score {1:0.03f} best parameters:{2:}".format(
        (i+1), score, best_mod_summary))
    return (scores, models, c_report, con_mat, X_test, y_test)

For this analysis I select only those specialties for which there are >=500 practitioners. Thus during regression there will be enough samples for the model to learn from them.

In total there are 113 specialties. However, if you look carefully, many specialties are very similar or may even be sub-categories of a given specialty. By setting a threshold of 500 practitioners, we limit the analysis to the top 12 specialties only.

In [12]:
# Selecting a subset of data where there are at least 500 practitioners
# for a specialty, so each class has enough samples to learn from.
feats, ys = zip(*data)
ys = pd.DataFrame(list(ys))  # preparing targets (provider variables)
specialty_count = ys['specialty'].value_counts()
print("Total number of specialties in dataset = ", len(specialty_count))
specialty_subset = [x for x, c in specialty_count.items() if c >= 500]
# BUGFIX: the message previously said "100 or more" but the threshold is 500.
print("Number of specialties with 500 or more practitioners = ", len(specialty_subset))

specialty_lookup = set(specialty_subset)  # set gives O(1) membership tests
data_subset = [(x_dict, y_dict) for x_dict, y_dict in data
               if y_dict['specialty'] in specialty_lookup]

feats, ys = zip(*data_subset)
ys = pd.DataFrame(list(ys))  # targets restricted to the selected specialties

vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(feats)          # one column per drug name
X = TfidfTransformer().fit_transform(X)      # tf-idf weighting of the counts
Total number of specialties in dataset =  113
Number of specialties with 500 or more practitioners =  12

Test case:

Performing logistic regression using region as targets.

In [48]:
# Test case: predict 'region' from the tf-idf prescription features.
from sklearn.linear_model import LogisticRegression

region_results = cross_validate(
    X,
    ys['region'],
    LogisticRegression(solver='lbfgs', multi_class='multinomial'),
    param_grid={'C': np.logspace(-1, 3, 4)},
)
region_score, region_model, region_class_report, region_conf_mat, X_test, y_test = region_results
Fold 1 score 0.738 best parameters:{'C': 2.1544346900318834}
Fold 2 score 0.719 best parameters:{'C': 2.1544346900318834}
Fold 3 score 0.725 best parameters:{'C': 2.1544346900318834}
Fold 4 score 0.724 best parameters:{'C': 2.1544346900318834}
Fold 5 score 0.720 best parameters:{'C': 46.415888336127772}
In [49]:
def run_all_experiments(X, ys, mod, targets=('gender','region','specialty'),
                        param_grid=None, n_fold=5):
    """Run `cross_validate` once per target column and collect the results.

    Parameters
    ----------
    X : array-like or sparse matrix
        Feature matrix shared by all experiments.
    ys : pandas.DataFrame
        Target columns; one experiment is run per name in `targets`.
    mod : estimator
        scikit-learn estimator passed through to cross_validate.
    targets : tuple of str
        Columns of `ys` to use as prediction targets.
    param_grid : dict, optional
        Hyper-parameter grid for GridSearchCV (None avoids the mutable
        default-argument pitfall of the original `{}`).
    n_fold : int
        Number of evaluation rounds per target.

    Returns
    -------
    list of dict
        One summary dict per target with per-fold scores, mean score,
        fitted models, and the last fold's report/confusion matrix/split.
    """
    if param_grid is None:
        param_grid = {}
    summary = []
    for target in targets:
        y = ys[target]
        this_summary = {'model': mod.__class__.__name__,
                        'target': target, 'n_classes': len(set(y))}
        print ("CURRENT TARGET : {}".format(target))
        scores_t, models_t, class_report_t, conf_mat_t, X_test, y_test = cross_validate(X, y, mod, param_grid=param_grid,
                                                                                        n_fold=n_fold)

        # Flatten the per-fold scores into score1..scoreN columns.
        for i, s in enumerate(scores_t, start=1):
            this_summary['score{}'.format(i)] = s
        this_summary['score_mean'] = scores_t.mean()
        this_summary['models'] = models_t
        this_summary['class_report'] = class_report_t
        this_summary['conf_matrix'] = conf_mat_t
        this_summary['X_test'] = X_test
        this_summary['y_test'] = y_test
        summary.append(this_summary)
    return(summary)

Performing logistic regression on all the targets

In [50]:
# Run logistic regression for every target (gender, region, specialty).
all_exp_log = run_all_experiments(X, ys,
                                  LogisticRegression(solver='lbfgs', multi_class='multinomial'),
                                  param_grid={'C': np.logspace(-1, 3, 4)})

# Saving the data for future use, so later cells can reload the results
# without re-running the expensive grid searches.
# (Removed a dead commented-out pickle.load line — the file is open 'wb'.)
import pickle
with open('all_exp_log.pkl', 'wb') as fid:
    pickle.dump(all_exp_log, fid)
    
CURRENT TARGET : gender
Fold 1 score 0.784 best parameters:{'C': 2.1544346900318834}
Fold 2 score 0.805 best parameters:{'C': 2.1544346900318834}
Fold 3 score 0.785 best parameters:{'C': 2.1544346900318834}
Fold 4 score 0.797 best parameters:{'C': 2.1544346900318834}
Fold 5 score 0.789 best parameters:{'C': 2.1544346900318834}
CURRENT TARGET : region
Fold 1 score 0.714 best parameters:{'C': 2.1544346900318834}
Fold 2 score 0.720 best parameters:{'C': 2.1544346900318834}
Fold 3 score 0.726 best parameters:{'C': 46.415888336127772}
Fold 4 score 0.732 best parameters:{'C': 46.415888336127772}
Fold 5 score 0.727 best parameters:{'C': 2.1544346900318834}
CURRENT TARGET : specialty
Fold 1 score 0.810 best parameters:{'C': 2.1544346900318834}
Fold 2 score 0.802 best parameters:{'C': 2.1544346900318834}
Fold 3 score 0.814 best parameters:{'C': 2.1544346900318834}
Fold 4 score 0.831 best parameters:{'C': 2.1544346900318834}
Fold 5 score 0.817 best parameters:{'C': 2.1544346900318834}

The table below shows the results of logistic regression for all three targets (region, gender and specialty). It is interesting to observe that, based on the drugs prescribed, it seems possible to determine the gender of the practitioner. This is likely because there is a strong correlation between gender and specialty, and not because male and female practitioners have any preference for the drugs they prescribe.

Below, we will look into the specialties in more detail.

In [2]:
import pickle
# Restore the cross-validation results saved earlier; this lets the
# analysis below re-run without repeating the expensive grid searches.
with open('all_exp_log.pkl','rb') as fid:
    all_exp_log = pickle.load(fid)
In [3]:
# One row per target: per-fold scores, mean score, and fitted artifacts.
all_target_table = pd.DataFrame(all_exp_log)
all_target_table
Out[3]:
X_test class_report conf_matrix model models n_classes score1 score2 score3 score4 score5 score_mean target y_test
0 (0, 57)\t0.0247383976546\n (0, 87)\t0.09952... precision recall f1-score s... [[645, 313], [207, 1299]] LogisticRegression [LogisticRegression(C=2.1544346900318834, clas... 2 0.783685 0.805195 0.785308 0.797484 0.788961 0.792127 gender 1006 F 8801 M 9445 M 24203 F 23...
1 (0, 24)\t0.0898739744446\n (0, 57)\t0.09541... precision recall f1-score s... [[238, 45, 170, 32], [48, 424, 66, 21], [73, 5... LogisticRegression [LogisticRegression(C=2.1544346900318834, clas... 4 0.714286 0.719562 0.725649 0.731737 0.726867 0.723620 region 12373 South 12672 Midwest 4103 ...
2 (0, 35)\t0.103765868315\n (0, 57)\t0.064381... precisio... [[6, 4, 2, 48, 7, 5, 4, 1, 3, 3, 0, 0], [0, 50... LogisticRegression [LogisticRegression(C=2.1544346900318834, clas... 12 0.809659 0.801948 0.814123 0.831169 0.816964 0.814773 specialty 3974 Family ...
In [4]:
# Pull out the experiment record for the 'specialty' target and show
# its per-class precision/recall/f1 report.
target_summary = next(entry for entry in all_exp_log
                      if entry['target'] == 'specialty')
print(target_summary['class_report'])
                                      precision    recall  f1-score   support

                        Adult Health       0.29      0.07      0.12        83
              Cardiovascular Disease       0.95      0.96      0.96       526
Endocrinology, Diabetes & Metabolism       0.93      0.91      0.92       141
                              Family       0.62      0.82      0.71       481
                  Geriatric Medicine       0.75      0.82      0.79       194
                  Infectious Disease       0.76      0.78      0.77        60
                             Medical       0.30      0.13      0.18       202
                          Nephrology       0.97      0.96      0.96       180
                           Neurology       0.96      0.99      0.97       172
                          Psychiatry       0.96      1.00      0.98       283
                   Pulmonary Disease       0.86      0.78      0.82        64
                        Rheumatology       1.00      0.92      0.96        78

                         avg / total       0.79      0.82      0.80      2464

Based on the classification report, it seems that the specialties of many practitioners could be accurately predicted (overall f1-score of 0.8, with many classes > 0.7 and up to 0.98). However, some specialties (Adult Health and Medical) have a very poor prediction score. Let's look at the confusion matrix to get an idea of why this would be the case.

Using Bokeh for Interactive Plot

In [5]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource,HoverTool
from bokeh.palettes import Oranges3 as palette
from bokeh.io import output_notebook
In [6]:
# Build the long-form data for the Bokeh heatmap: one record per
# (true specialty, predicted specialty) cell of the confusion matrix.
spec_names = target_summary['models'][0].classes_.tolist()
alpha_max = np.max(target_summary['conf_matrix'])  # NOTE(review): unused below
total_spec = target_summary['conf_matrix'].sum(axis=1)  # row sums = per-class support

xname = []
yname = []
alpha = []
for i, t1_name in enumerate(spec_names):
    for j, t2_name in enumerate(spec_names):
        xname.append(t1_name)
        yname.append(t2_name)
        # Shade each cell by the fraction of true class i predicted as class j.
        alpha.append(target_summary['conf_matrix'][i][j] / total_spec[i])

# (Removed a stray discarded `.shape` expression and a commented-out
# cell_weight line from the original cell.)
source = ColumnDataSource(data=dict(xname=xname, yname=yname, alphas=alpha,
                                    count=target_summary['conf_matrix'].flatten(),))
In [7]:
output_notebook()  # route Bokeh output to inline notebook cells
Loading BokehJS ...
In [8]:
# Interactive confusion-matrix heatmap: axes are specialties, cell
# opacity encodes the row-normalized confusion fraction built above.
p = figure(title="Confusion matrix",
           x_axis_location="above", tools="hover,save",
           x_range=list(reversed(spec_names)), y_range=spec_names)
p.plot_width = 700
p.plot_height = 700
p.grid.grid_line_color = None  # hide gridlines for a cleaner matrix look
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "9pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3  # slant the long specialty names
p.title.text_font_size = '18pt'
# One rect per confusion-matrix cell; the 'alphas' column drives opacity.
p.rect('xname', 'yname', 1.0, 1.0, source=source, alpha='alphas',hover_line_color='black',
      fill_color='#c51b8a')

# Hover tooltip shows the (true, predicted) pair and the raw count.
p.select_one(HoverTool).tooltips = [
    ('spec_names', '@yname, @xname'),
    ('count', '@count'),]
# Alternative fill colors tried: #637939, #e6550d, #c51b8a
In [9]:
show(p)  # render the interactive confusion-matrix heatmap

The confusion matrix now gives a better idea of why the f1-scores for 'Adult Health' and 'Medical' are so low. It seems that many practitioners with a specialty in these classes are being wrongly predicted as 'Family'!

Next, we will use LIME to understand the reason behind this mis-classification. More specifically, we will look at random individual examples from these categories. Using the weights obtained during the logistic regression in conjunction with the drugs prescribed by these individual practitioners will provide insight into the poor f1-scores.

Local Interpretable Model-Agnostic Explanations (LIME)

In the section below I will use LIME to understand the classification made by logistic regression.

Let's first look at a specialty that is predicted very robustly by Logistic regression.

In [13]:
# target_summary already exists (loaded from the pickled experiments).
# LIME works on dense arrays, so densify the sparse tf-idf test matrix.
X_toarray = target_summary['X_test'].toarray()

# Pick any one of the models that is in the target summary.
model_t = target_summary['models'][0]
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn
# in favor of get_feature_names_out() — confirm the installed version.
explainer = LimeTabularExplainer(X_toarray, feature_names=vectorizer.get_feature_names(),
                                 class_names=model_t.classes_, discretize_continuous=True)

# Pick 5 random test samples from a robustly predicted specialty.
# BUGFIX: `replace` is documented as a bool; `replace=None` only worked
# because None is falsy — use the explicit False (no replacement).
psychiatry_idx = [i for i, cls in enumerate(target_summary['y_test'].values) if cls == 'Psychiatry']
samples = np.random.choice(psychiatry_idx, size=5, replace=False)

explain_all = []
for sample in samples:
    print ("Sample number = ", sample)
    print ("True Specialty = ", target_summary['y_test'].iloc[sample])
    # Run LIME: perturb the sample and fit a local surrogate model.
    exp = explainer.explain_instance(X_toarray[sample], model_t.predict_proba,
                                     num_features=5, top_labels=1)
    explain_all.append({'sample': exp})
    exp.show_in_notebook(show_table=True, show_all=False)
    exp.save_to_file(target_summary['y_test'].iloc[sample] + str(sample) + '.html')


    
Sample number =  361
True Specialty =  Psychiatry
Sample number =  295
True Specialty =  Psychiatry
Sample number =  400
True Specialty =  Psychiatry
Sample number =  802
True Specialty =  Psychiatry
Sample number =  2296
True Specialty =  Psychiatry

LIME estimates the probability of a sample belonging to a particular class by perturbing the values of its features and then measuring how this affects the predicted class probabilities.

For the specialty 'Psychiatry', which is predicted very robustly, I test 5 random samples. The right-most table shows the features that were used to make the classification and their values. One thing to notice is that most of the prescription drugs (features) that LIME used to make the prediction are very similar across samples for this specialty (e.g. 'CLONAZEPAM', 'QUETIAPINE FUMARATE', etc.).

In [14]:
# NOTE(review): this cell duplicates the LIME loop used for 'Psychiatry'
# above — consider factoring sampling + explanation into one function.
# Pick 5 random test samples from the poorly predicted 'Adult Health' class.
# BUGFIX: `replace` is documented as a bool; `replace=None` only worked
# because None is falsy — use the explicit False (no replacement).
adult_health_idx = [i for i, cls in enumerate(target_summary['y_test'].values) if cls == 'Adult Health']
samples = np.random.choice(adult_health_idx, size=5, replace=False)

explain_all = []
for sample in samples:
    print ("Sample number = ", sample)
    print ("True Specialty = ", target_summary['y_test'].iloc[sample])
    # Run LIME: perturb the sample and fit a local surrogate model.
    exp = explainer.explain_instance(X_toarray[sample], model_t.predict_proba,
                                     num_features=5, top_labels=1)
    explain_all.append({'sample': exp})
    exp.show_in_notebook(show_table=True, show_all=False)
    exp.save_to_file(target_summary['y_test'].iloc[sample] + str(sample) + '.html')
Sample number =  1699
True Specialty =  Adult Health
Sample number =  1027
True Specialty =  Adult Health
Sample number =  2436
True Specialty =  Adult Health
Sample number =  137
True Specialty =  Adult Health
Sample number =  114
True Specialty =  Adult Health

Above, I have done the LIME analysis for 5 samples randomly chosen from the specialty 'Adult Health'. The right-most table shows that the drugs (features) prescribed by practitioners belonging to this class vary a lot. Also, the prediction probabilities for these practitioners to belong to 'Adult Health' are not very high, and in many cases it is only the second or third most probable specialty.

In [ ]: